library(haven)
library(urltools)
library(rjson)
library(rvest)
# devtools::install_github("jayjacobs/tldextract")
library(tldextract)
library(tidyverse)


# Fix the RNG seed so that any stochastic step is reproducible.
# NOTE(review): nothing visible in this script draws random numbers, and
# set.seed() is documented to interpret its argument as an integer, so the
# fractional part of pi + 4 presumably has no effect -- confirm.
set.seed(pi + 4)

# Return the `domain` component that urltools::suffix_extract() reports for
# the host of each URL in `x`.
#
# x: character vector of URLs (or bare host names).
# Returns a character vector with one entry per element of `x`.
# NOTE(review): per the urltools docs this should be the label between the
# subdomain and the public suffix (e.g. "example" for "www.example.com"),
# i.e. without the suffix -- confirm.
get_proper_domain <- function(x) {
  hosts <- urltools::url_parse(x)[["domain"]]
  host_parts <- urltools::suffix_extract(hosts)
  host_parts[["domain"]]
}

# Input 1: URL-level table written by an earlier step of the project.
news <- read_csv("../results/political_urls.csv")

# Input 2: the SPSS export of the November survey (labelled columns).
survey <- read_sav(
  "../data/November Survey/STAN0089_OUTPUT_all.sav"
)

# internet_primary is 1 when any of main_news_source_2_pre ...
# main_news_source_7_pre equals 1, and 0 otherwise (no item equal to 1,
# missing values included).
# NOTE(review): internet_primary is not listed in the select() applied to
# `survey` further down, so it never reaches the merged output -- confirm
# whether that is intended.
survey <- mutate(
  survey,
  internet_primary = case_when(
    main_news_source_2_pre == 1 |
      main_news_source_3_pre == 1 |
      main_news_source_4_pre == 1 |
      main_news_source_5_pre == 1 |
      main_news_source_6_pre == 1 |
      main_news_source_7_pre == 1 ~ 1,
    TRUE ~ 0
  )
)


# TODO: I should just get all the variables Peterson et al. or Guess have
#
# Recode the raw survey items into analysis covariates, keep only those
# columns, convert the remaining labelled columns to factors, and drop
# factor levels that never occur.
survey <- survey %>%
  mutate(
    caseid = as.integer(caseid),
    # party ID with value labels stripped; code 8 is set to missing
    pid = na_if(zap_labels(pid7_pre), 8),
    # three-way party built from the recoded `pid` defined just above
    # NOTE(review): a missing pid falls through to "ind" -- confirm
    party = as_factor(case_when(
      pid %in% 1:3 ~ "dem",
      pid %in% 5:7 ~ "rep",
      TRUE ~ "ind"
    )),
    income = na_if(faminc_pre, 97),
    # NOTE(review): gender_pre is copied unchanged; whether it is coded as a
    # female indicator is not visible here -- confirm
    female = gender_pre,
    # survey was administered Nov. 2016
    age = as.integer(2016 - birthyr_pre),
    educ = educ_pre,
    ideology = na_if(ideo5_pre, 6),
    # vote indicators; a missing presvote16_post yields 0 for both
    clinton = case_when(
      as_factor(presvote16_post) == "Hillary Clinton" ~ 1,
      TRUE ~ 0
    ),
    trump = case_when(
      as_factor(presvote16_post) == "Donald Trump" ~ 1,
      TRUE ~ 0
    ),
    voted_major = clinton + trump,
    abortion = Q26_pre,
    refugees = Q27_pre,
    primary = case_when(
      Q2_pre == 1 ~ 1,
      TRUE ~ 0
    ),
    # unlike the case_when() indicators above, this one stays NA when
    # Q31_pre is missing
    newsint = 1 * (Q31_pre == 1),
    race = as_factor(case_when(
      race_pre == 1 ~ "White",
      race_pre == 2 ~ "Black",
      race_pre == 3 ~ "Hispanic",
      race_pre == 4 ~ "Asian",
      TRUE ~ "Other"
    )),
    weight = as.numeric(weight_pulse)
  ) %>%
  select(
    caseid, pid, party, income, female, age, educ,
    ideology, clinton, trump, voted_major, refugees, abortion,
    primary, newsint, race, weight
  ) %>%
  # turn the remaining haven-labelled columns into factors ...
  as_factor() %>%
  # ... and drop unused levels (across() replaces the superseded mutate_if())
  mutate(across(where(is.factor), fct_drop))

# need to select some columns

# Attach the respondent covariates to every row of `news`, matching on
# caseid; rows without a survey match are kept.
news <- left_join(news, survey, by = "caseid")



# Step 1: per respondent, pfrac = mean of `portal` over that respondent's
#         rows, with the survey covariates joined back on.
# Step 2: per party-ID value, the mean of pfrac, its standard error, and
#         bounds at m -/+ 2 * se.
foo <- news %>%
  group_by(caseid) %>%
  summarise(pfrac = mean(portal)) %>%
  left_join(survey, by = "caseid") %>%
  group_by(pid) %>%
  summarise(
    m = mean(pfrac),
    se = sd(pfrac) / sqrt(n())
  ) %>%
  mutate(
    lwr = m - 2 * se,
    upr = m + 2 * se
  )


# Display labels for party-ID codes 1 through 7, in plotting order.
pid_levels <- c(
  "Strong DEM", "Weak DEM", "Lean DEM", "Pure IND",
  "Lean REP", "Weak REP", "Strong REP"
)

# Replace the numeric code with its label; any value outside 1..7
# (including NA) becomes NA and that row is dropped.
foo <- foo %>%
  mutate(pid = factor(pid, levels = 1:7, labels = pid_levels)) %>%
  filter(!is.na(pid))

# Point-range plot of the mean portal share (with its lwr/upr bounds) for
# each party-ID category. No colour or fill aesthetic is mapped, so no
# colour/fill scales are needed.
p <- ggplot(foo, aes(pid, m, ymin = lwr, ymax = upr)) +
  geom_pointrange() +
  theme_classic() +
  labs(x = "Party ID", y = "Share of Political News from Portals")

# Pass the plot explicitly rather than relying on ggsave()'s default of
# last_plot(), so the saved figure cannot silently become a different plot
# if code is inserted above.
ggsave(
  "../results/figures/portal_pid.pdf",
  plot = p,
  device = "pdf",
  width = 6,
  height = 4
)



# Domain-level alignment scores: keep the domain and avg_align columns,
# strip a leading "www." from each domain, and keep the first row for each
# resulting domain.
# The pattern is anchored and the dot escaped: the previous "www." was an
# unanchored regex whose "." matched any character, so it could remove text
# from the middle of a host name (e.g. "awww.com" -> "acom").
# NOTE(review): assumes `domain` holds bare host names (no scheme prefix)
# -- confirm against top500.csv.
bakshy <- read_csv("../data/top500.csv") %>%
  select(domain, avg_align) %>%
  mutate(domain = str_remove(domain, "^www\\.")) %>%
  distinct(domain, .keep_all = TRUE)

# Plain-vector copies of the two columns, used as globals further down.
bakshy_domains <- bakshy$domain
proper_bakshy_domains <- bakshy$domain
bakshy_align <- bakshy$avg_align

# Match each URL to an entry of the global `bakshy_domains` and attach that
# entry's row from the global `bakshy` table.
#
# The bakshy domains are not in a standard format: some have subdomains and
# some don't. A bakshy domain is a candidate for a URL when
#   1. it occurs in the URL as a literal substring, and
#   2. get_proper_domain() gives the same value for it and for the URL
#      (this keeps cross-site references inside a URL from producing a
#      match).
# When several candidates remain, the longest one (most specific) wins.
#
# url_list: character vector of URLs.
# Returns a tibble with one row per URL, in input order: `bakshy_domain`
# (NA when nothing matched) plus the remaining columns of `bakshy`.
# `bakshy` is expected to have one row per domain.
#
# Changes from the previous version:
#   * the longest candidate is now really selected; ifelse() with a
#     length-one test silently kept only the first candidate;
#   * domains are matched literally (fixed()) instead of as regular
#     expressions, so "." no longer matches any character;
#   * empty input returns a zero-row result instead of failing on
#     1:length(found);
#   * get_proper_domain() is evaluated once per vector rather than once per
#     URL.
find_bakshy_align <- function(url_list) {
  if (length(url_list) == 0) {
    best_match <- character(0)
  } else {
    url_sites <- get_proper_domain(url_list)
    bakshy_sites <- get_proper_domain(bakshy_domains)

    best_match <- map_chr(seq_along(url_list), function(i) {
      # which() drops NA comparisons (missing URL or missing domain)
      hits <- which(str_detect(url_list[[i]], fixed(bakshy_domains)))
      hits <- hits[which(bakshy_sites[hits] == url_sites[[i]])]
      if (length(hits) == 0) {
        return(NA_character_)
      }
      candidates <- bakshy_domains[hits]
      candidates[which.max(nchar(candidates))]
    })
  }

  tibble(domain = best_match) %>%
    left_join(bakshy, by = "domain") %>%
    rename(bakshy_domain = domain)
}

# Look up the bakshy match for every URL, append the resulting columns to
# `news`, and expose the alignment score under the name b_align.
news <- rename(
  bind_cols(news, find_bakshy_align(pull(news, url))),
  b_align = avg_align
)

# Persist the merged table.
write_csv(news, "../results/merged_news.csv")